import os
import random

from gxl_ai_utils.utils import utils_file


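# The data is processed in small chunks (originally noted as 100); the function below merges the per-chunk manifests back together.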
def do_cacat_for_k2_manifest(output_manifest_dir, num_split):
    """Concatenate the per-split manifests into two shuffled output manifests.

    For every ``i`` in ``range(num_split)`` the wav and text manifest files are
    read from ``<output_manifest_dir>/temp/temp_<i>/``. All wav entries are
    gathered into one list and all text entries into another; each list is
    shuffled and written to ``output_manifest_dir`` under the same file names.

    The file names come from
    ``utils_file.get_jsonl_filename4icefall("gxldata", "train")``.

    Note: the two lists are shuffled independently, so line ``k`` of the wav
    output does not correspond to line ``k`` of the text output.

    Args:
        output_manifest_dir: Directory that holds the ``temp`` sub-directory
            with the split manifests and that receives the merged files.
        num_split: Number of ``temp_<i>`` sub-directories to read.
    """
    split_root = os.path.join(output_manifest_dir, "temp")
    utils_file.makedir_sil(split_root)
    wav_name, text_name = utils_file.get_jsonl_filename4icefall("gxldata", "train")

    merged_wav_lines, merged_text_lines = [], []
    for split_idx in range(num_split):
        split_dir = os.path.join(split_root, f'temp_{split_idx}')
        # Load wav first, then text, for each split.
        merged_wav_lines += utils_file.load_list_file_clean(
            os.path.join(split_dir, wav_name)
        )
        merged_text_lines += utils_file.load_list_file_clean(
            os.path.join(split_dir, text_name)
        )

    targets = (
        (merged_wav_lines, os.path.join(output_manifest_dir, wav_name)),
        (merged_text_lines, os.path.join(output_manifest_dir, text_name)),
    )
    # Shuffle both lists (wav, then text) before anything is written.
    for lines, _ in targets:
        random.shuffle(lines)
    for lines, destination in targets:
        utils_file.write_list_to_file(lines, destination)

